Set up

using <- function(...) {
    # Attach the requested packages, installing any that are missing:
    # first from CRAN, then (for whatever CRAN cannot provide) from
    # Bioconductor. Returns the per-package attach status as a list of
    # logicals, one entry per requested package.
    libs <- unlist(list(...))
    # First attach attempt; collect the names that failed to load.
    # (The original code also overwrote `libs` here, so its return value
    # only covered packages that were absent on the first try.)
    need <- libs[!unlist(lapply(libs, require, character.only = TRUE))]
    if (length(need) > 0) {
        install.packages(need, repos = "https://cloud.r-project.org")
        # Retry; anything still failing is assumed to be on Bioconductor
        need <- need[!unlist(lapply(need, require, character.only = TRUE))]
        if (length(need) > 0) {
          if (!requireNamespace("BiocManager", quietly = TRUE))
            install.packages("BiocManager")
          BiocManager::install(need)
        }
    }
    # Final attach pass over ALL requested packages (also loads anything
    # BiocManager::install() put in place but did not attach)
    lapply(libs, require, character.only = TRUE)
}
# Packages used throughout: data wrangling (data.table, tidyverse),
# plotting (lattice, gridExtra, patchwork) and genomics
# (rtracklayer, DiffBind, idr2d)
using("data.table", "tidyverse", "lattice", "gridExtra",
      "rtracklayer", "DiffBind", "idr2d", "patchwork")

IDR

We read in MACS peak calls and select coordinate columns as well as the p-value column, then run IDR

# Read the two replicate narrowPeak files, keeping chrom/start/end
# (columns 1-3) plus column 8 (the MACS2 -log10 p-value), estimate the
# irreproducible discovery rate between replicates with idr2d, and keep
# the per-peak table for replicate 1
idr <- c('KO','KO2') %>%
  paste0('_H3K27ac_peaks.narrowPeak') %>%
  lapply(fread, select = c(1:3,8)) %>%
  {estimate_idr1d(.[[1]], .[[2]], value_transformation = 'identity')} %>%
  .$rep1_df

# Two scatter plots combined side by side via patchwork's `+`:
# rank vs replicate rank, and value vs replicate value, each point
# colored by its estimated IDR (facet labels used as panel titles)
ggplot(idr, aes(x = rank, y = rep_rank, color = idr)) +
  geom_point(size = .1) + scale_color_gradientn(colors = rainbow(10)) +
  facet_grid(.~'rank') + theme(legend.position = 'none') +
  ggplot(idr, aes(x = value, y = rep_value, color = idr)) +
  geom_point(size = .1) + scale_color_gradientn(colors = rainbow(10)) +
  facet_grid(.~'value')


Peak callers

We’ll take the outputs of different peak callers and compare them


Import

You could take the raw outputs and re-arrange them into a more friendly form

You don’t need to run this chunk in class

# For every peak file, compute interval sizes (column 3 minus column 2,
# i.e. end - start) and stack all files into one long table; after
# melt(), the source file name sits in column L1, from which phenotype,
# histone mark and peak caller are decoded
pks <- lapply(list.files('peaks', full.names = T), function(x) {
  fread(x) %>% 
    mutate(diff = .[[3]] - .[[2]]) %>%
    pull(diff)
}) %>%
  setNames(list.files('peaks')) %>%
  melt() %>%
  mutate(pheno = sub('_.*', '', L1),
         mark = sub('^[^_]*_(.*)(_peaks.*|\\.txt)', '\\1', L1),
         caller = case_when(
           grepl('txt', L1) ~ "SICER",
           grepl('narrow', L1) ~ "MACS2 narrow",
           grepl('broad', L1) ~ "MACS2 broad",
           TRUE ~ "NA"))

Which has already been done and can be found in the provided file

load('lec10.RData')

Compare

We can then perform some basic comparisons

# size of individual intervals: violins of peak/domain sizes per
# condition and caller, with median and interquartile range overlaid
# as a point-range, faceted by histone mark (log10 y-axis)
ggplot(pks, aes(x = pheno, y = value, fill = caller)) +
  labs(x = "Condition", y = "Peak/domain size") +
  geom_violin(position = position_dodge(.8)) + 
  geom_pointrange(stat = "summary",
                  fun.min = function(v) quantile(v, .25),
                  fun.max = function(v) quantile(v, .75),
                  fun = median, fatten = .8,
                  position = position_dodge(.8)) +
  scale_y_log10() + facet_grid(. ~ mark) +
  theme(legend.position = 'bottom') 


# total size covered by enriched intervals: sum of interval sizes per
# phenotype/mark/caller, shown as dodged bars on a log10 axis
pks %>%
  group_by(pheno, mark, caller) %>%
  summarise(bases = sum(value)) %>%
  ggplot(aes(x = pheno, y = bases, fill = caller)) +
  labs(x = "Phenotype", y = "Bases in peaks") +
  geom_col(position = "dodge") + 
  scale_y_log10(expand = expansion(c(0,.05))) +
  facet_grid(. ~ mark) +
  theme(legend.position = 'bottom')


Count matrix

We can take the called peaks and merge them to obtain a reference set of genomic intervals – into which we can count reads just as we’ve done with genes. This can be done easily with DiffBind using a sample sheet containing the locations and BAM / peak files as well as some metadata.

You don’t need to run this chunk in class

# Build a DiffBind object from the sample sheet (BAM/peak locations and
# metadata), then count reads in the merged consensus peak set — slow,
# hence provided pre-computed in lec10.RData
dat <- dba(sampleSheet = "samples.csv")
dat <- dba.count(dat)

But since the counting process takes a bit of time, we’ll start with the pre-computed object (already loaded from lec10.RData). As with transcriptomic and methylomic matrices, we can again perform routine analyses such as PCA and hierarchical clustering

PCA

dba.plotPCA(dat, label = DBA_CONDITION)


Correlation

dba.plotHeatmap(dat, correlations = T)


Hierarchical clustering

dba.plotHeatmap(dat, correlations = F)


Genome-wide summaries

When we have many regions of interest across the genome, we can assess the distribution of signals surrounding such areas and evaluate the overall trend. One tool for performing this is deepTools, and we can examine the results of their computeMatrix module.

You don’t need to run this chunk in class

library(parallel)
# Column-wise averages of each deepTools computeMatrix output (skipping
# the header line and the 6 coordinate columns), i.e. the mean signal
# profile per file, computed in parallel over 12 cores
mats <- mclapply(list.files('mats', full.names = T), function(x)
  colMeans(fread(x, skip = 1, header = F, drop = 1:6),
           na.rm = T), mc.cores = 12) %>%
  setNames(list.files('mats'))
# Keep one complete matrix (all rows) as a worked example
mat.ex <- fread('mats/KO_H3K27ac.rx.promoter.mat.gz', skip = 1, header = F)

Heatmap

We’ll first take the KO sample’s H3K27ac enrichment surrounding promoters as an example

# use only the matrix part, throw away the coordinates info
# (first 6 columns), and order rows by their mean signal
dat.ex <- mat.ex[,-(1:6)] %>%
  arrange(rowMeans(.)) %>%
  as.matrix

# plot as a raster heatmap: one row per interval, one column per bin;
# color scale fixed to [-2, 3] with the palette in `clrs` (from lec10.RData)
levelplot(t(dat.ex), useRaster = T, xlab = "Position", ylab = "Interval",
          scales = list(draw = F), col.regions = clrs,
          at = seq(-2, 3, length.out = 100), aspect = "fill")


Including expression

Given the link between promoter acetylation and the cognate gene’s expression, we further incorporate RNA-seq data. Again, for the sake of time, it’s been imported and provided in lec10.RData

You don’t need to run this chunk in class

# RNA-seq gene-level counts (featureCounts output)
exps <- fread('BT245-KO-C4.UCSC.featureCounts.txt.counts')
library(biomaRt)
# Query Ensembl GRCh37 for gene coordinates by gene symbol
ensembl <- useMart(biomart = "ENSEMBL_MART_ENSEMBL",
                   host = "grch37.ensembl.org",
                   path = "/biomart/martservice",
                   dataset = "hsapiens_gene_ensembl")
res <- getBM(attributes = c("external_gene_name", "chromosome_name",
                            "start_position", "end_position"),
             filters = "external_gene_name", values = exps$Geneid,
             mart = ensembl)
# Join coordinates to counts: drop duplicated symbols, keep only genes
# present in both tables, restrict to the main chromosomes
# (note: `exps[...]` uses data.table's single-argument row filter)
dict <- left_join(res[!duplicated(res$external_gene_name) &
                        res$external_gene_name %in% exps$Geneid,],
                  exps[exps$Geneid %in% res$external_gene_name],
                  by = c("external_gene_name" = "Geneid")) %>%
  filter(chromosome_name %in% c(1:22, "X", "Y"))

With the expression data in hand, we now combine the two modalities

# get promoters intervals from matrix (coordinate columns V1-V3 of the
# deepTools matrix)
pmts <- makeGRangesFromDataFrame(mat.ex,
                                 seqnames.field = 'V1',
                                 start.field = 'V2',
                                 end.field = 'V3')

# get gene intervals overlapping promoters; the matrix uses "chr"-prefixed
# seqnames, so prefix the Ensembl chromosome names to match
dict$chr <- paste0("chr", dict$chromosome_name)
genes <- makeGRangesFromDataFrame(dict,
                                  start.field = "start_position",
                                  end.field = "end_position",
                                  seqnames.field = "chr",
                                  keep.extra.columns = T)
# keep only promoters/genes with a counterpart in the other set
pmts.ol <- pmts[overlapsAny(pmts, genes)]
genes.ol <- genes[overlapsAny(genes, pmts)]

# partition genes into quintiles by expression: rank length-normalized
# counts, then cut the ranks at their 0/20/40/60/80/100% quantiles
ranks <- rank(genes.ol$counts/abs((end(genes.ol) - start(genes.ol))),
              ties.method = "first")
genes.ol$cut <- as.numeric(cut(ranks, quantile(ranks, probs = seq(0, 1, 0.2)),
                       include.lowest = TRUE))
# label each promoter (as "chr:start-end", matching matrix column V4) with
# the quintile of the FIRST overlapping gene
pmts.cut <- data.frame(V4 = sprintf("%s:%d-%d", seqnames(pmts.ol),
                                    start(pmts.ol),end(pmts.ol)),
                       cut = genes.ol$cut[findOverlaps(pmts.ol, genes.ol,
                                                       select = "first")])

# integrate: attach the signal bins (dropping coordinate columns except
# the V4 join key) to each quintile-labeled promoter, order rows within
# each quintile by mean signal, then split into one data frame per quintile
dat.ex <- left_join(pmts.cut, mat.ex[,-c(1:3, 5, 6)]) %>%
  select(-V4) %>%
  arrange(cut, rowMeans(select(., -cut))) %>%
  split(., .$cut)

Heatmaps

We can plot the results as heatmaps again

# plot heatmap for each quintile (columns 2:601 are the 600 signal bins;
# column 1 is the quintile label) and stack the five panels vertically
lapply(1:5, function(i) {
  levelplot(t(dat.ex[[i]][,2:601]), useRaster = T,
                           xlab = NULL, ylab = NULL, scales = list(draw = F),
                           col.regions = clrs, aspect = "fill",
                           at = seq(-2, 3, length.out = 100),
                           colorkey = F, margin = F)
}) %>% grid.arrange(grobs = ., ncol = 1)


Aggregate plots

Or alternatively take the column-wise average across said heatmap as a summary

# Column-wise means per quintile -> one profile line per expression level;
# after melt() the quintile label is in column L1, and the within-profile
# bin position is recovered from the row index modulo the profile length
lapply(dat.ex, function(x) colMeans(x[,2:601])) %>%
  melt() %>%
  rownames_to_column("row") %>%
  mutate(row = (as.numeric(row) - 1) %% (n()/5) + 1) %>%
  ggplot(aes(x = row, y = value, color = L1)) +
  # shade the gene-body region (bins 200-400) between the flanks
  annotate("rect", xmin = 200, xmax = 400, ymin = -Inf,
           ymax = Inf, alpha = 0.2, fill = "grey") +
  geom_line() +
  labs(y = "Enrichment", title = "H3K27ac in promoters",
       color = "Expression level") +
  scale_x_continuous(breaks = c(1, 200 * 1:3),
                     labels = c("-2kb", "Start", "End", "+2kb"),
                     expand = c(0, 0), name = "Position") +
  theme_bw() + theme(legend.position = "bottom")


Correlation

Or more directly we can just assess the relationship per-promoter

# Per-promoter expression vs acetylation: compute TPM for the first gene
# overlapping each promoter, then compare against the mean signal over
# the central bins (202:401, i.e. the gene body)
ols <- findOverlaps(pmts.ol, genes.ol, select = "first")
genes.ol$len <- abs(end(genes.ol) - start(genes.ol)) / 1000  # kb
fac <- sum(genes.ol$counts / genes.ol$len) / 1e6             # per-million factor
pmts.cut$tpm <- (genes.ol$counts[ols] / genes.ol$len[ols]) / fac
dat.ex <- left_join(pmts.cut, mat.ex[,-c(1:3, 5, 6)]) %>%
  select(-c(V4, cut)) %>%
  mutate(rm = rowMeans(select(., 202:401))) %>%
  select(tpm, rm)
smoothScatter(x = log2(dat.ex$tpm + 1), y = dat.ex$rm,
              xlab = "Expression", ylab = "H3K27ac")


Normalization

We’ve been only looking at one sample thus far, and to contrast different samples we’ll need to make sure the scales are meaningfully comparable


Aggregate plot

Taking our previous promoter aggregate plots, different methods would point to rather divergent conclusions

# define groups: split each matrix file name (sample_mark.norm.region...)
# into its metadata fields, keeping the original name in column f
ann <- tibble(f = names(mats)) %>%
  separate(f, c('samps', 'marks', 'norms', 'regs', NA, NA), '[_\\.]', F) 

# what to plot
mark <- 'H3K27ac'
reg <- 'promoter'

# label groups: stack the selected profiles (melt puts the file name in
# L1), recover the bin position from the row index (6 samples x 3
# normalizations), attach the metadata, and fix the normalization order
dats <- melt(mats[ann$f[ann$marks == mark & ann$regs == reg]]) %>%
  rownames_to_column("row") %>%
  mutate(row = (as.numeric(row) - 1) %% (n()/6) + 1) %>%
  merge(dplyr::rename(ann, L1 = f)) %>%
  mutate(norms = factor(norms, levels = c('raw', 'inp', 'rx'))) %>%
  select(-L1)

# label axis: len = bins per segment (profiles split over 6 samples and
# 3 segments: upstream / body / downstream)
# NOTE(review): the two branches disagree on units — the kb branch treats
# one bin as 10 bases (len / 100 kb == len * 10 b) while the "b" branch
# prints `len` bases directly; confirm the intended bin width
len <- nrow(dats) / 6 / 3
if (len * 10 < 1000) {
  lab.l <- sprintf("-%db", len)
  lab.r <- sprintf("+%db", len)
} else {
  lab.l <- sprintf("-%dkb", len / 100)
  lab.r <- sprintf("+%dkb", len / 100)
}

# plot: one profile line per sample, shaded gene body, faceted by
# normalization method (free scales so each method is readable)
ggplot(dats, aes(x = row, y = value, color = samps)) +
  annotate("rect", xmin = len, xmax = len * 2, ymin = -Inf,
           ymax = Inf, alpha = 0.2, fill = "grey") +
  geom_line() +
  labs(y = "Enrichment", title = sprintf("%s @ %s", mark, reg)) +
  scale_x_continuous(breaks = c(1, len * 1:3),
                     labels = c(lab.l, "Start", "End", lab.r),
                     expand = c(0, 0), name = "Position") +
  facet_wrap(. ~ norms, scales = "free") + theme_bw()

  1. Do other regions or marks exhibit different trends?

Count in windows

As opposed to limiting ourselves to some set of pre-defined regions, we could also just take the counts in uniformly divided windows across the genome, which is stored in the cts object (a list of bedGraph files imported like below)

You don’t need to run this chunk in class

# Import genome-wide 10kb-window counts (one bedGraph per sample) as a
# named list of GRanges
cts <- lapply(list.files('10kb', full.names = T), import.bedGraph) %>%
  setNames(sub('.10kb.bed', '', list.files('10kb')))

We now apply various normalization factors

# read in scaling factors: per-condition/mark read counts for the human
# (hs) and spike-in Drosophila (dm) genomes, ChIP and input
facs <- fread('https://hgen663.me/rx.csv')
condition mark hs_chip hs_input dm_chip dm_input
KO H3K9me3 56785959 34362619 1202438 1936047
KO H3K27ac 25379930 34362619 3671901 1936047
KO H3K36me2 46933048 34362619 428509 1936047
KO H3K27me3 46474170 34362619 3752386 1936047
K27M H3K9me3 60483553 30721591 1798155 1334441
K27M H3K27ac 35887148 30721591 2601095 1334441
K27M H3K36me2 65841940 30721591 785877 1334441
K27M H3K27me3 35038431 30721591 14848754 1334441
# Derive the normalization factors:
#   ref       - smaller of ChIP/input human depth (downsampling target)
#   depthfac1 - scales ChIP down to ref; depthfac2 - scales input to ref
#   rx        - ChIP-Rx spike-in ratio: (hs/dm of ChIP) over (hs/dm of input)
#   scalefac1 - spike-in ratio combined with ChIP depth scaling
facs <- facs %>%
  mutate(sample = paste(facs$condition, facs$mark, sep = "_")) %>%
  rowwise() %>%
  mutate(ref = min(hs_chip, hs_input)) %>%
  ungroup() %>%
  mutate(depthfac1 = ref / hs_chip,
         depthfac2 = ref / hs_input,
         rx = (hs_chip / hs_input) / (dm_chip / dm_input),
         scalefac1 = rx * depthfac1)
condition mark hs_chip hs_input dm_chip dm_input sample ref depthfac1 depthfac2 rx scalefac1
KO H3K9me3 56785959 34362619 1202438 1936047 KO_H3K9me3 34362619 0.6051253 1.0000000 2.6607735 1.6101013
KO H3K27ac 25379930 34362619 3671901 1936047 KO_H3K27ac 25379930 1.0000000 0.7385913 0.3894297 0.3894297
KO H3K36me2 46933048 34362619 428509 1936047 KO_H3K36me2 34362619 0.7321625 1.0000000 6.1708992 4.5181011
KO H3K27me3 46474170 34362619 3752386 1936047 KO_H3K27me3 34362619 0.7393918 1.0000000 0.6978045 0.5159509
K27M H3K9me3 60483553 30721591 1798155 1334441 K27M_H3K9me3 30721591 0.5079330 1.0000000 1.4610526 0.7421168
K27M H3K27ac 35887148 30721591 2601095 1334441 K27M_H3K27ac 30721591 0.8560611 1.0000000 0.5992919 0.5130305
K27M H3K36me2 65841940 30721591 785877 1334441 K27M_H3K36me2 30721591 0.4665961 1.0000000 3.6391815 1.6980278
K27M H3K27me3 35038431 30721591 14848754 1334441 K27M_H3K27me3 30721591 0.8767970 1.0000000 0.1024968 0.0898689
# normalize: for every ChIP track, scale each 10kb bin, add a pseudocount
# of 1, and express it as a log2 ratio over its matched input track
# (matched by replacing everything after the condition with "_input")
for (i in seq_len(nrow(facs))) {

  # current sample and its raw bin counts
  samp <- facs$sample[i]
  chip <- cts[[samp]]$score

  # depth-scaled input counts shared by both normalizations
  inp.track <- sub('_.*', '_input', samp)
  denom <- facs$depthfac2[i] * cts[[inp.track]]$score + 1

  # spike-in (rx) and depth-only (input) log2 ratios
  cts[[samp]]$rx <- log2((facs$scalefac1[i] * chip + 1) / denom)
  cts[[samp]]$input <- log2((facs$depthfac1[i] * chip + 1) / denom)
}

And take H3K27me3 as an example

# compare counts: raw log2 bin counts of one mark between conditions,
# with the identity line for reference
mark <- 'H3K27me3'

samp1 <- paste0('KO_', mark)
samp2 <- paste0('K27M_', mark)
smoothScatter(x = log2(cts[[samp1]]$score + 1),
              y = log2(cts[[samp2]]$score + 1),
              xlab = samp1, ylab = samp2, main = "Raw")
abline(a = 0, b = 1)


Or as MA plots

# Build M (log difference) and A (log average) coordinates for each
# normalization scheme, then draw one MA plot per scheme side by side
log.s1 <- log(cts[[samp1]]$score)
log.s2 <- log(cts[[samp2]]$score)
ma <- list(
  raw = data.frame(m = log.s1 - log.s2,
                   a = 0.5 * (log.s1 + log.s2)),
  input = data.frame(m = cts[[samp1]]$input - cts[[samp2]]$input,
                     a = 0.5 * (cts[[samp1]]$input + cts[[samp2]]$input)),
  rx = data.frame(m = cts[[samp1]]$rx - cts[[samp2]]$rx,
                  a = 0.5 * (cts[[samp1]]$rx + cts[[samp2]]$rx))
)

par(mfrow = c(1,3))
for (nm in names(ma)) {
  smoothScatter(x = ma[[nm]]$a, y = ma[[nm]]$m, ylab = "M", xlab = "A",
                main = sprintf("%s - %s (%s)", samp1, samp2, nm))
  abline(h = 0)
}

  1. Do the other marks also seem adequately normalized with any of the approaches?